import pandas as pd
import numpy as np
import seaborn as sns
from chart_studio import plotly as py
import plotly.graph_objs as go
from plotly import tools
import matplotlib.pyplot as plt
sns.set(style="whitegrid")
%matplotlib inline
import datetime, warnings, scipy
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.patches import ConnectionPatch
from collections import OrderedDict
from matplotlib.gridspec import GridSpec
from mpl_toolkits.basemap import Basemap
from scipy.optimize import curve_fit
# Global display settings for the notebook.
# Always draw an edge around patches (bars, wedges, ...).
plt.rcParams["patch.force_edgecolor"] = True
plt.style.use('fivethirtyeight')
mpl.rc('patch', edgecolor = 'dimgray', linewidth=1)
from IPython.core.interactiveshell import InteractiveShell
# Only echo the value of the last expression of each cell.
InteractiveShell.ast_node_interactivity = "last_expr"
pd.options.display.max_columns = 50
# NOTE(review): this silences every warning, including deprecation warnings
# raised by pandas / seaborn / matplotlib -- consider a narrower filter.
warnings.filterwarnings("ignore")
# Load the two reference tables and the two flight data sets.
airlines = pd.read_csv('data/airlines.csv')
airports = pd.read_csv('data/airports.csv')
flights, flights_2019 = (
    pd.read_csv(path, low_memory=False)
    for path in ('data/flights.csv', 'data/flights_sample_3m_renamed.csv')
)
# Report (rows, columns) of every table, one per line.
for frame in (airlines, airports, flights, flights_2019):
    print(frame.shape)
(14, 2) (322, 7) (5819079, 31) (3000000, 30)
Each row of flights.csv and flights_sample_3m_renamed.csv (loaded as `flights_2019`) represents one flight: more than 5,800,000 flights in the 2015 data set and 3,000,000 in the 2019 data set. The flights are described by 31 and 30 variables respectively (the 2019 data set does not contain the TAIL_NUMBER column). The main variables are:
- YEAR, MONTH, DAY, DAY_OF_WEEK: date of the flight
- AIRLINE: two-character carrier code identifying a unique airline (e.g. AA, DL)
- SCHEDULED_DEPARTURE and SCHEDULED_ARRIVAL : scheduled/approximated times of taking off and landing
- DEPARTURE_TIME and ARRIVAL_TIME: real times at which the previously mentioned variables happened
- DEPARTURE_DELAY and ARRIVAL_DELAY: difference (in minutes) between planned and real times
- DISTANCE: distance (in miles)
Exploring whether the cancellation reasons match up with the cancellation numbers¶
# Cancelled flights counted two ways: via the CANCELLED flag and via the
# number of rows that carry a cancellation reason.
print(len(flights) * flights['CANCELLED'].mean())
print(flights['CANCELLATION_REASON'].notnull().sum())
89884.0 89884
# Same consistency check for the second data set.
print(len(flights_2019) * flights_2019['CANCELLED'].mean())
print(flights_2019['CANCELLATION_REASON'].notnull().sum())
79140.0 79140
The numbers fit, so let's check whether only the flights that arrived at least 15 minutes late have a delay reason recorded¶
# Flights arriving at least 15 minutes late vs. rows with a delay breakdown.
print((flights['ARRIVAL_DELAY'] >= 15).sum())
print(flights['AIR_SYSTEM_DELAY'].notnull().sum())
1063439 1063439
# Same comparison for the second data set.
print((flights_2019['ARRIVAL_DELAY'] >= 15).sum())
print(flights_2019['AIR_SYSTEM_DELAY'].notnull().sum())
533863 533863
The counts match, so in both data sets a delay reason appears to be recorded only for real delays (arrival delay of at least 15 minutes).
def plot_airport_flights(airports, flights, year):
    """Draw every airport on a map, colored and sized by departing flights.

    Parameters
    ----------
    airports : DataFrame with ``IATA_CODE``, ``LATITUDE`` and ``LONGITUDE``.
    flights : DataFrame with an ``ORIGIN_AIRPORT`` column; each row counts as
        one departing flight.
    year : label of the data set, shown in the plot title.

    Airports are grouped into the bins ``1 <.< 100`` ... ``100000 <.< 1000000``.
    Airports without any matching flight are drawn in the lowest bin and
    airports above the last limit are drawn in the highest bin.
    """
    count_flights = flights['ORIGIN_AIRPORT'].value_counts()
    colors = ['lightyellow', 'yellow', 'orange', 'orangered', 'red']
    size_limits = [1, 100, 1000, 10000, 100000, 1000000]
    bin_labels = [f"{low} <.< {high}"
                  for low, high in zip(size_limits, size_limits[1:])]

    fig, ax = plt.subplots(figsize=(11, 11))
    # Create Basemap instance (named so that the builtin `map` is not shadowed)
    basemap = Basemap(resolution='i', llcrnrlon=-180, urcrnrlon=-50,
                      llcrnrlat=10, urcrnrlat=75, lat_0=0, lon_0=0, ax=ax)
    basemap.shadedrelief()
    basemap.drawcoastlines()
    basemap.drawcountries(linewidth=3)
    basemap.drawstates(color='0.3')

    # Plot airports on the map
    for code, lat, lon in zip(airports['IATA_CODE'], airports['LATITUDE'],
                              airports['LONGITUDE']):
        x, y = basemap(lon, lat)
        flight_count = count_flights.get(code, 0)
        # Index of the last limit strictly below the count, clamped to the
        # available bins: without the upper clamp a count above 1,000,000
        # would index past the end of `colors`.
        n_below = sum(limit < flight_count for limit in size_limits)
        ind = min(max(n_below - 1, 0), len(colors) - 1)
        basemap.plot(x, y, 'o', markersize=ind + 5, markeredgewidth=1,
                     color=colors[ind], markeredgecolor='k',
                     label=bin_labels[ind])

    # Keep one legend entry per bin, ordered from the smallest bin upwards.
    handles, legend_labels = ax.get_legend_handles_labels()
    by_label = OrderedDict(zip(legend_labels, handles))
    new_label = OrderedDict((key, by_label[key])
                            for key in bin_labels if key in by_label)
    ax.legend(new_label.values(), new_label.keys(), loc='upper right',
              prop={'size': 11}, title='Number of flights per year',
              frameon=True, framealpha=1)
    # The two maps are otherwise indistinguishable, so label the data set.
    ax.set_title(f'Flights per origin airport ({year})')
    plt.show()
# Draw the airport map once per data set (second data set first, then 2015).
plot_airport_flights(airports, flights_2019, 2019)
plot_airport_flights(airports, flights, 2015)
Converting the date columns of both DataFrames to datetime format¶
# Assemble a single datetime column from the separate year/month/day columns.
date_parts = ['YEAR', 'MONTH', 'DAY']
for frame in (flights, flights_2019):
    frame['DATE'] = pd.to_datetime(frame[date_parts])
The SCHEDULED_DEPARTURE column is represented by 4-digit numbers, the first two digits giving the hour and the last two the minutes. This format is not convenient, so we convert it and merge it with the flight date.
def format_hour(chaine):
    """Convert an HHMM clock value into a ``datetime.time``.

    ``chaine`` is a number such as ``5`` (00:05) or ``1234.0`` (12:34).
    Missing values yield ``np.nan``; ``2400`` is mapped to midnight (00:00).
    """
    if pd.isnull(chaine):
        return np.nan
    # datetime.time has no hour 24, so 2400 is expressed as 00:00.
    hhmm = 0 if chaine == 2400 else chaine
    digits = f"{int(hhmm):04d}"
    return datetime.time(int(digits[:2]), int(digits[2:4]))
def combine_date_hour(x):
    """Merge ``x[0]`` (a date) and ``x[1]`` (a time) into one datetime.

    Returns ``np.nan`` as soon as either component is missing.
    """
    if any(pd.isnull(x[position]) for position in (0, 1)):
        return np.nan
    return datetime.datetime.combine(x[0], x[1])
def create_flight_time(df, col):
    """Combine ``df['DATE']`` with an HHMM column into full timestamps.

    Parameters
    ----------
    df : DataFrame holding a ``DATE`` column and the column named by ``col``.
    col : name of a numeric column encoding a clock time as HHMM
        (e.g. ``5`` for 00:05, ``1234`` for 12:34).

    Returns
    -------
    A Series of timestamps on a fresh 0..n-1 index (as before, the index of
    ``df`` is not carried over). Rows whose date or time is missing are
    ``NaT``. A time of ``2400`` rolls over to 00:00 of the following day.

    The computation is vectorised: the previous row-by-row version shadowed
    the builtin ``list``, modified each row through positional Series
    indexing (deprecated in recent pandas) and was very slow on millions of
    rows. ``df`` is left untouched.
    """
    dates = pd.to_datetime(df['DATE']).reset_index(drop=True)
    hhmm = pd.to_numeric(df[col]).reset_index(drop=True)
    # HHMM -> minutes after midnight; 2400 becomes 1440 minutes, i.e. exactly
    # one day, which yields midnight of the next day when added to the date.
    minutes = (hhmm // 100) * 60 + hhmm % 100
    result = dates + pd.to_timedelta(minutes, unit='m')
    result.name = None
    return result
# Convert the HHMM columns of the 2015 data set: the scheduled departure
# becomes a full datetime, the other three become plain clock times.
# TAKES LONG TO RUN
flights['SCHEDULED_DEPARTURE'] = create_flight_time(flights, 'SCHEDULED_DEPARTURE')
flights['DEPARTURE_TIME'] = flights['DEPARTURE_TIME'].apply(format_hour)
flights['SCHEDULED_ARRIVAL'] = flights['SCHEDULED_ARRIVAL'].apply(format_hour)
flights['ARRIVAL_TIME'] = flights['ARRIVAL_TIME'].apply(format_hour)
# Same conversion for the second data set.
# TAKES LONG TO RUN
flights_2019['SCHEDULED_DEPARTURE'] = create_flight_time(flights_2019, 'SCHEDULED_DEPARTURE')
flights_2019['DEPARTURE_TIME'] = flights_2019['DEPARTURE_TIME'].apply(format_hour)
flights_2019['SCHEDULED_ARRIVAL'] = flights_2019['SCHEDULED_ARRIVAL'].apply(format_hour)
flights_2019['ARRIVAL_TIME'] = flights_2019['ARRIVAL_TIME'].apply(format_hour)
# saving csv in case of something (creating these takes a lot of time)
# The files are written without a .csv extension.
flights.to_csv('flights', encoding='utf-8', index=False)
flights_2019.to_csv('flights_2019', encoding='utf-8', index=False)
# NOTE(review): reloading from CSV without parse_dates presumably brings the
# converted date/time columns back as plain strings -- confirm this is what
# the rest of the notebook expects.
flights = pd.read_csv('flights', low_memory=False)
flights_2019 = pd.read_csv('flights_2019', low_memory=False)
The DEPARTURE_TIME and ARRIVAL_TIME variables can be misleading because they don't include dates. For example, the first entry in the dataframe shows a scheduled departure at 00:05 on January 1st. The DEPARTURE_TIME is listed as 23:54, so it's unclear whether the flight left early or had a large delay. Therefore, the DEPARTURE_DELAY and ARRIVAL_DELAY variables are more useful since they directly show the delays in minutes. Thus, we will not use the DEPARTURE_TIME and ARRIVAL_TIME variables in the following analysis.
# Columns that play no role in the delay analysis.
variables_to_remove = [
    'TAXI_OUT', 'TAXI_IN', 'WHEELS_ON', 'WHEELS_OFF', 'YEAR',
    'MONTH', 'DAY', 'DAY_OF_WEEK', 'DATE', 'AIR_SYSTEM_DELAY',
    'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY',
    'WEATHER_DELAY', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON',
    'FLIGHT_NUMBER', 'TAIL_NUMBER', 'AIR_TIME',
]
# Remaining columns, in the order they should be displayed.
kept_columns = [
    'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
    'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY',
    'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 'ARRIVAL_DELAY',
    'SCHEDULED_TIME', 'ELAPSED_TIME',
]
flights = flights.drop(columns=variables_to_remove)[kept_columns]
flights.head()
| AIRLINE | ORIGIN_AIRPORT | DESTINATION_AIRPORT | SCHEDULED_DEPARTURE | DEPARTURE_TIME | DEPARTURE_DELAY | SCHEDULED_ARRIVAL | ARRIVAL_TIME | ARRIVAL_DELAY | SCHEDULED_TIME | ELAPSED_TIME | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AS | ANC | SEA | 2015-01-01 00:05:00 | 23:54:00 | -11.0 | 04:30:00 | 04:08:00 | -22.0 | 205.0 | 194.0 |
| 1 | AA | LAX | PBI | 2015-01-01 00:10:00 | 00:02:00 | -8.0 | 07:50:00 | 07:41:00 | -9.0 | 280.0 | 279.0 |
| 2 | US | SFO | CLT | 2015-01-01 00:20:00 | 00:18:00 | -2.0 | 08:06:00 | 08:11:00 | 5.0 | 286.0 | 293.0 |
| 3 | AA | LAX | MIA | 2015-01-01 00:20:00 | 00:15:00 | -5.0 | 08:05:00 | 07:56:00 | -9.0 | 285.0 | 281.0 |
| 4 | AS | SEA | ANC | 2015-01-01 00:25:00 | 00:24:00 | -1.0 | 03:20:00 | 02:59:00 | -21.0 | 235.0 | 215.0 |
# Columns that play no role in the delay analysis (this data set has no
# TAIL_NUMBER column to remove).
variables_to_remove = [
    'TAXI_OUT', 'TAXI_IN', 'WHEELS_ON', 'WHEELS_OFF', 'YEAR',
    'MONTH', 'DAY', 'DAY_OF_WEEK', 'DATE', 'AIR_SYSTEM_DELAY',
    'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY',
    'WEATHER_DELAY', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON',
    'FLIGHT_NUMBER', 'AIR_TIME',
]
# Remaining columns, in the order they should be displayed.
kept_columns = [
    'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
    'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY',
    'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 'ARRIVAL_DELAY',
    'SCHEDULED_TIME', 'ELAPSED_TIME',
]
flights_2019 = flights_2019.drop(columns=variables_to_remove)[kept_columns]
flights_2019.head()
| AIRLINE | ORIGIN_AIRPORT | DESTINATION_AIRPORT | SCHEDULED_DEPARTURE | DEPARTURE_TIME | DEPARTURE_DELAY | SCHEDULED_ARRIVAL | ARRIVAL_TIME | ARRIVAL_DELAY | SCHEDULED_TIME | ELAPSED_TIME | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | UA | FLL | EWR | 2019-01-09 11:55:00 | 11:51:00 | -4.0 | 15:01:00 | 14:47:00 | -14.0 | 186.0 | 176.0 |
| 1 | DL | MSP | SEA | 2022-11-19 21:20:00 | 21:14:00 | -6.0 | 23:15:00 | 23:10:00 | -5.0 | 235.0 | 236.0 |
| 2 | UA | DEN | MSP | 2022-07-22 09:54:00 | 10:00:00 | 6.0 | 12:52:00 | 12:52:00 | 0.0 | 118.0 | 112.0 |
| 3 | DL | MSP | SFO | 2023-03-06 16:09:00 | 16:08:00 | -1.0 | 18:29:00 | 18:53:00 | 24.0 | 260.0 | 285.0 |
| 4 | NK | MCO | DFW | 2020-02-23 18:40:00 | 18:38:00 | -2.0 | 20:41:00 | 20:40:00 | -1.0 | 181.0 | 182.0 |
Examining how complete the data is:¶
# Per-column count of missing values and the share of filled-in rows.
n_rows = flights.shape[0]
missing_flights = (
    flights.isnull().sum(axis=0)
    .rename_axis('variable')
    .reset_index(name='missing values')
)
missing_flights['filling factor (%)'] = (n_rows - missing_flights['missing values']) / n_rows * 100
missing_flights.sort_values('filling factor (%)').reset_index(drop=True)
| variable | missing values | filling factor (%) | |
|---|---|---|---|
| 0 | ARRIVAL_DELAY | 105071 | 98.194371 |
| 1 | ELAPSED_TIME | 105071 | 98.194371 |
| 2 | ARRIVAL_TIME | 92513 | 98.410178 |
| 3 | DEPARTURE_TIME | 86153 | 98.519474 |
| 4 | DEPARTURE_DELAY | 86153 | 98.519474 |
| 5 | SCHEDULED_TIME | 6 | 99.999897 |
| 6 | AIRLINE | 0 | 100.000000 |
| 7 | ORIGIN_AIRPORT | 0 | 100.000000 |
| 8 | DESTINATION_AIRPORT | 0 | 100.000000 |
| 9 | SCHEDULED_DEPARTURE | 0 | 100.000000 |
| 10 | SCHEDULED_ARRIVAL | 0 | 100.000000 |
# Per-column count of missing values and the share of filled-in rows.
n_rows = flights_2019.shape[0]
missing_flights_2019 = (
    flights_2019.isnull().sum(axis=0)
    .rename_axis('variable')
    .reset_index(name='missing values')
)
missing_flights_2019['filling factor (%)'] = (n_rows - missing_flights_2019['missing values']) / n_rows * 100
missing_flights_2019.sort_values('filling factor (%)').reset_index(drop=True)
| variable | missing values | filling factor (%) | |
|---|---|---|---|
| 0 | ARRIVAL_DELAY | 86198 | 97.126733 |
| 1 | ELAPSED_TIME | 86198 | 97.126733 |
| 2 | ARRIVAL_TIME | 79942 | 97.335267 |
| 3 | DEPARTURE_DELAY | 77644 | 97.411867 |
| 4 | DEPARTURE_TIME | 77615 | 97.412833 |
| 5 | SCHEDULED_TIME | 14 | 99.999533 |
| 6 | AIRLINE | 0 | 100.000000 |
| 7 | ORIGIN_AIRPORT | 0 | 100.000000 |
| 8 | DESTINATION_AIRPORT | 0 | 100.000000 |
| 9 | SCHEDULED_DEPARTURE | 0 | 100.000000 |
| 10 | SCHEDULED_ARRIVAL | 0 | 100.000000 |
# Lookup table: carrier code -> full airline name.
abbr_companies = dict(zip(airlines['IATA_CODE'], airlines['AIRLINE']))
airlines
| IATA_CODE | AIRLINE | |
|---|---|---|
| 0 | UA | United Air Lines Inc. |
| 1 | AA | American Airlines Inc. |
| 2 | US | US Airways Inc. |
| 3 | F9 | Frontier Airlines Inc. |
| 4 | B6 | JetBlue Airways |
| 5 | OO | Skywest Airlines Inc. |
| 6 | AS | Alaska Airlines Inc. |
| 7 | NK | Spirit Air Lines |
| 8 | WN | Southwest Airlines Co. |
| 9 | DL | Delta Air Lines Inc. |
| 10 | EV | Atlantic Southeast Airlines |
| 11 | HA | Hawaiian Airlines Inc. |
| 12 | MQ | American Eagle Airlines Inc. |
| 13 | VX | Virgin America |
def get_stats(group):
    """Summarise ``group`` as a dict with its min, max, count and mean."""
    return dict(
        min=group.min(),
        max=group.max(),
        count=group.count(),
        mean=group.mean(),
    )
# Departure-delay statistics per airline, smallest carrier first.
global_stats_2015 = (
    flights.groupby('AIRLINE')['DEPARTURE_DELAY']
    .apply(get_stats)
    .unstack()
    .sort_values('count')
)
global_stats_2015
| min | max | count | mean | |
|---|---|---|---|---|
| AIRLINE | ||||
| VX | -24.0 | 644.0 | 61385.0 | 9.022595 |
| HA | -27.0 | 1433.0 | 76119.0 | 0.485713 |
| F9 | -46.0 | 1112.0 | 90290.0 | 13.350858 |
| NK | -37.0 | 836.0 | 115454.0 | 15.944766 |
| AS | -82.0 | 963.0 | 171910.0 | 1.785801 |
| US | -35.0 | 759.0 | 194825.0 | 6.141137 |
| B6 | -31.0 | 1006.0 | 262843.0 | 11.514353 |
| MQ | -36.0 | 1544.0 | 280282.0 | 10.125188 |
| UA | -40.0 | 1314.0 | 509534.0 | 14.435441 |
| EV | -55.0 | 1274.0 | 557294.0 | 8.715934 |
| OO | -56.0 | 1378.0 | 579086.0 | 7.801104 |
| AA | -68.0 | 1988.0 | 715598.0 | 8.900856 |
| DL | -61.0 | 1289.0 | 872177.0 | 7.369254 |
| WN | -28.0 | 665.0 | 1246129.0 | 10.581986 |
# Departure-delay statistics per airline, smallest carrier first.
global_stats_2019 = (
    flights_2019.groupby('AIRLINE')['DEPARTURE_DELAY']
    .apply(get_stats)
    .unstack()
    .sort_values('count')
)
global_stats_2019
| min | max | count | mean | |
|---|---|---|---|---|
| AIRLINE | ||||
| EV | -54.0 | 1839.0 | 18028.0 | 12.774462 |
| QX | -38.0 | 754.0 | 20284.0 | 4.832775 |
| HA | -31.0 | 1615.0 | 31730.0 | 5.089537 |
| G4 | -51.0 | 1778.0 | 50367.0 | 13.907797 |
| YV | -56.0 | 2036.0 | 62686.0 | 12.277383 |
| F9 | -46.0 | 1375.0 | 62846.0 | 16.033574 |
| NK | -47.0 | 1302.0 | 93433.0 | 12.981880 |
| AS | -73.0 | 771.0 | 98616.0 | 4.640018 |
| OH | -43.0 | 1919.0 | 103854.0 | 7.972124 |
| B6 | -62.0 | 1834.0 | 109882.0 | 18.322555 |
| 9E | -90.0 | 2579.0 | 110140.0 | 5.951135 |
| MQ | -33.0 | 1543.0 | 117758.0 | 6.728256 |
| YX | -66.0 | 1352.0 | 138611.0 | 5.804359 |
| UA | -82.0 | 1451.0 | 249083.0 | 11.217032 |
| OO | -68.0 | 2327.0 | 336153.0 | 9.458294 |
| AA | -89.0 | 2966.0 | 372441.0 | 12.609895 |
| DL | -49.0 | 1244.0 | 389339.0 | 8.106748 |
| WN | -52.0 | 648.0 | 557105.0 | 10.816957 |
# Pie charts for the 2015 data set: share of flights and mean departure
# delay per airline.
mpl.rc('font', family='Arial', weight='bold', size=15)
base_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
               '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
# Repeat the palette so that every airline gets a color.
n_airlines = len(global_stats_2015)
colors_pie = (base_colors * (n_airlines // len(base_colors) + 1))[:n_airlines]

# Only the top row of the 2x2 grid is used.
fig = plt.figure(figsize=(16, 15))
gs = GridSpec(2, 2, figure=fig)
ax1 = fig.add_subplot(gs[0, 0])
ax2 = fig.add_subplot(gs[0, 1])

# Pie chart 1: Number of flights
ax1.pie(global_stats_2015['count'], labels=global_stats_2015.index, colors=colors_pie,
        autopct='%1.0f%%', startangle=0)
ax1.set_title('% of flights per company', color='b', fontsize=18)

# Pie chart 2: Mean delay at departure
# A pie cannot show negative values, so means are clipped at zero. The wedge
# labels must be rescaled with the sum of the *clipped* values: autopct
# receives each wedge's percentage of that sum, so using the unclipped sum
# would print wrong minutes as soon as one airline has a negative mean.
mean_delays = global_stats_2015['mean'].astype(float).clip(lower=0)
total_delay = mean_delays.sum()
ax2.pie(mean_delays, labels=global_stats_2015.index, colors=colors_pie,
        autopct=lambda p: '{:.0f}'.format(p * total_delay / 100), startangle=0)
ax2.set_title('Mean delay at origin', color='b', fontsize=18)

# Adjust layout for pie charts
plt.tight_layout(w_pad=3)
plt.show()
# Pie charts for the second data set: share of flights and mean departure
# delay per airline.
mpl.rc('font', family='Arial', weight='bold', size=15)
base_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
               '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
# Repeat the palette so that every airline gets a color.
n_airlines = len(global_stats_2019)
colors_pie = (base_colors * (n_airlines // len(base_colors) + 1))[:n_airlines]

# Only the top row of the 2x2 grid is used.
fig = plt.figure(figsize=(16, 15))
gs = GridSpec(2, 2, figure=fig)
ax1 = fig.add_subplot(gs[0, 0])
ax2 = fig.add_subplot(gs[0, 1])

# Pie chart 1: Number of flights
ax1.pie(global_stats_2019['count'], labels=global_stats_2019.index, colors=colors_pie,
        autopct='%1.0f%%', startangle=0)
ax1.set_title('% of flights per company', color='b', fontsize=18)

# Pie chart 2: Mean delay at departure
# A pie cannot show negative values, so means are clipped at zero. The wedge
# labels must be rescaled with the sum of the *clipped* values: autopct
# receives each wedge's percentage of that sum, so using the unclipped sum
# would print wrong minutes as soon as one airline has a negative mean.
mean_delays = global_stats_2019['mean'].astype(float).clip(lower=0)
total_delay = mean_delays.sum()
ax2.pie(mean_delays, labels=global_stats_2019.index, colors=colors_pie,
        autopct=lambda p: '{:.0f}'.format(p * total_delay / 100), startangle=0)
ax2.set_title('Mean delay at origin', color='b', fontsize=18)

# Adjust layout for pie charts
plt.tight_layout(w_pad=3)
plt.show()
# Strip plot of every recorded departure delay of the 2015 data set.
# One row per flight: carrier and departure delay.
df2 = flights.loc[:, ['AIRLINE', 'DEPARTURE_DELAY']]
# Swap carrier codes for the full airline names.
df2['AIRLINE'] = df2['AIRLINE'].replace(abbr_companies)
# Define colors for strip plot
base_colors_strip = ['#d62728', '#ffbb78', '#ff9896', '#2ca02c', '#1f77b4',
'#98df8a', '#7f7f7f', '#9467bd', '#ff7f0e', '#e377c2',
'#8c564b', '#bcbd22', '#aec7e8', '#17becf']
# Extend or repeat colors to match the number of airlines
colors_strip = (base_colors_strip * ((len(global_stats_2015) // len(base_colors_strip)) + 1))[:len(global_stats_2015)]
# Strip plot: Departure delays
fig, ax3 = plt.subplots(figsize=(16, 7.5))
sns.stripplot(y="AIRLINE", x="DEPARTURE_DELAY", data=df2, size=4, palette=colors_strip,
ax=ax3, jitter=True, linewidth=0.5)
# Relabel the x ticks: divmod(x, 60) splits each tick value into the two
# numbers shown before 'h' and 'm'.
ax3.set_xticklabels(['{:2.0f}h{:2.0f}m'.format(*divmod(x, 60)) for x in ax3.get_xticks()], fontsize=14)
ax3.set_yticklabels(ax3.get_yticklabels(), fontsize=14)
ax3.set_xlabel('Departure delay', fontsize=18, bbox={'facecolor':'midnightblue', 'pad':5}, color='w', labelpad=20)
ax3.yaxis.label.set_visible(False)
# Adjust layout for strip plot
plt.tight_layout()
plt.show()
# Strip plot of every recorded departure delay of the second data set.
# One row per flight: carrier and departure delay.
df2 = flights_2019.loc[:, ['AIRLINE', 'DEPARTURE_DELAY']]
# Swap carrier codes for the full airline names; codes that are not keys of
# abbr_companies are left as they are.
df2['AIRLINE'] = df2['AIRLINE'].replace(abbr_companies)
# Define colors for strip plot
base_colors_strip = ['#d62728', '#ffbb78', '#ff9896', '#2ca02c', '#1f77b4',
'#98df8a', '#7f7f7f', '#9467bd', '#ff7f0e', '#e377c2',
'#8c564b', '#bcbd22', '#aec7e8', '#17becf']
# Extend or repeat colors to match the number of airlines
colors_strip = (base_colors_strip * ((len(global_stats_2019) // len(base_colors_strip)) + 1))[:len(global_stats_2019)]
# Strip plot: Departure delays
fig, ax3 = plt.subplots(figsize=(16, 7.5))
sns.stripplot(y="AIRLINE", x="DEPARTURE_DELAY", data=df2, size=4, palette=colors_strip,
ax=ax3, jitter=True, linewidth=0.5)
# Relabel the x ticks: divmod(x, 60) splits each tick value into the two
# numbers shown before 'h' and 'm'.
ax3.set_xticklabels(['{:2.0f}h{:2.0f}m'.format(*divmod(x, 60)) for x in ax3.get_xticks()], fontsize=14)
ax3.set_yticklabels(ax3.get_yticklabels(), fontsize=14)
ax3.set_xlabel('Departure delay', fontsize=18, bbox={'facecolor':'midnightblue', 'pad':5}, color='w', labelpad=20)
ax3.yaxis.label.set_visible(False)
# Adjust layout for strip plot
plt.tight_layout()
plt.show()
Considering the first pie chart showing the percentage of flights per airline, we observe some disparity among carriers. For example, Southwest Airlines (WN) makes up about 20% of the flights, which is comparable to the combined total of the eight smallest airlines.
In contrast, the second pie chart shows less variation among airlines. Excluding some outliers, such as Hawaiian Airlines and Alaska Airlines which have very low mean delays, a delay of about 11 ± 7 minutes represents the average delays accurately. This low average suggests that most airlines generally adhere to their schedules.
The final figures provide an overview of all delays recorded. They highlight the dispersion of the data and contrast with the relative uniformity seen in the second pie chart. While the average delays hover around 10 minutes, showing that most flights depart on time, there are occasional significant delays that can last several hours.
def plot_delay_distribution(df, delay_column, airline_column, abbr_companies):
    """Bar chart of the share of flights per airline and delay category.

    Every flight is assigned a level from its ``delay_column`` value:
    0 (at most 5 minutes), 1 (more than 5, at most 45 minutes) or
    2 (more than 45 minutes). Bar heights are percentages of *all* rows of
    ``df``, not of the airline's own flights.

    Side effect: a ``DELAY_LEVEL`` column is written into ``df`` in place.

    Parameters
    ----------
    df : DataFrame with the delay and airline columns.
    delay_column : name of the column holding delays in minutes.
    airline_column : name of the column holding the carrier code.
    abbr_companies : dict mapping carrier code to full airline name; codes
        missing from the dict are shown unchanged.
    """
    def classify(minutes):
        # Missing delays compare False against both thresholds -> level 0.
        if minutes > 45:
            return 2
        if minutes > 5:
            return 1
        return 0

    def share_of_all_flights(values):
        # Size of the group as a percentage of the whole frame.
        return len(values) / len(df) * 100

    df['DELAY_LEVEL'] = df[delay_column].apply(classify)

    # Green / orange / red for levels 0 / 1 / 2.
    level_palette = ['#2ca02c', '#ff7f0e', '#d62728']
    fig, ax = plt.subplots(figsize=(14, 10))
    sns.barplot(
        data=df, x=airline_column, y=delay_column, hue='DELAY_LEVEL',
        palette=level_palette, estimator=share_of_all_flights, ci=None, ax=ax,
    )

    # Show full airline names instead of carrier codes.
    full_names = [abbr_companies.get(tick.get_text(), tick.get_text())
                  for tick in ax.get_xticklabels()]
    ax.set_xticklabels(full_names, rotation=90)

    plt.setp(ax.get_xticklabels(), fontsize=12, weight='bold')
    plt.setp(ax.get_yticklabels(), fontsize=12, weight='normal')
    ax.xaxis.label.set_visible(False)
    ax.set_ylabel('Percentage of Flights (%)', fontsize=16, weight='bold', labelpad=10)

    # Replace the numeric level names in the legend with readable captions.
    legend_labels = ['On time (t ≤ 5 min)', 'Small delay (5 < t ≤ 45 min)', 'Large delay (t > 45 min)']
    legend = ax.legend(title='Delay Level')
    for text, caption in zip(legend.get_texts(), legend_labels):
        text.set_text(caption)

    plt.tight_layout()
    plt.show()
# Delay-level breakdown per airline for the 2015 data set.
# Note: the call also adds a DELAY_LEVEL column to `flights`.
plot_delay_distribution(
df=flights,
delay_column='DEPARTURE_DELAY',
airline_column='AIRLINE',
abbr_companies=abbr_companies
)
# Same chart for the second data set (adds DELAY_LEVEL to `flights_2019`).
plot_delay_distribution(
df=flights_2019,
delay_column='DEPARTURE_DELAY',
airline_column='AIRLINE',
abbr_companies=abbr_companies
)
def plot_delay_histograms(dataframe, statistics, abbr_companies):
    """Plot, per airline, the histogram of departure delays with an exponential fit.

    Parameters
    ----------
    dataframe : DataFrame with ``AIRLINE`` (carrier code) and
        ``DEPARTURE_DELAY`` (minutes) columns.
    statistics : DataFrame indexed by carrier code; one panel is drawn per
        index entry, in index order.
    abbr_companies : dict mapping carrier code to full airline name; codes
        missing from the dict are used as panel titles unchanged.

    Each panel shows the normalised histogram of delays between 15 and 180
    minutes and the fitted curve ``a * exp(-x / b)``. The grid has three
    columns and at least five rows; it grows when more than 15 carriers are
    requested (the previous fixed 5x3 grid silently dropped the rest).
    Panels whose fit cannot be computed are drawn without curve and
    parameters instead of aborting the whole figure.
    """
    # Model function used to fit the histograms
    def func(x, a, b):
        return a * np.exp(-x / b)

    n_cols = 3
    carriers = list(statistics.index)
    n_rows = max(5, -(-len(carriers) // n_cols))  # ceil division, min 5 rows

    # Keep the original 11x11 size for five rows and scale the height beyond.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(11, 11 * n_rows / 5),
                             constrained_layout=True)
    axes = axes.flatten()

    for i, (ax, carrier_code) in enumerate(zip(axes, carriers)):
        carrier_name = abbr_companies.get(carrier_code, carrier_code)
        # Select by carrier code directly; renaming the whole frame first is
        # not needed to pick one airline's delays.
        delays = dataframe.loc[dataframe['AIRLINE'] == carrier_code,
                               'DEPARTURE_DELAY'].dropna()

        n, bins, _ = ax.hist(delays, range=(15, 180), density=True, bins=60,
                             alpha=0.75, color='blue')
        bin_centers = (bins[:-1] + bins[1:]) / 2

        # curve_fit raises RuntimeError when it does not converge and
        # ValueError when the histogram holds non-finite values (no data).
        try:
            popt, _ = curve_fit(func, bin_centers, n, p0=[1, 2])
        except (RuntimeError, ValueError):
            popt = None
        if popt is not None:
            ax.plot(bin_centers, func(bin_centers, *popt), 'r-', linewidth=2)

        ax.set_title(carrier_name, fontsize=14, fontweight='bold', color='darkblue')

        # Only the lowest populated panel of each column gets x tick labels.
        is_bottom_panel = i + n_cols >= len(carriers)
        if is_bottom_panel:
            ax.set_xticklabels([f'{int(x // 60)}h{int(x % 60)}m' for x in ax.get_xticks()],
                               rotation=45)
            ax.set_xlabel('Delay at origin', fontsize=12)
        else:
            ax.set_xticklabels([])
        if i % n_cols == 0:  # First column
            ax.set_ylabel('Normalized count of flights', fontsize=12, rotation=90)

        # Display fit parameters
        if popt is not None:
            ax.text(0.68, 0.7, f'a = {popt[0]:.2f}\nb = {popt[1]:.1f}',
                    transform=ax.transAxes, fontsize=12,
                    bbox={'facecolor': 'tomato', 'alpha': 0.8, 'pad': 5})

    # Hide the grid cells that have no carrier.
    for unused_ax in axes[len(carriers):]:
        unused_ax.set_visible(False)

    plt.show()
# Delay histograms with exponential fit, one figure per data set; the panels
# follow the row order of the respective statistics table.
plot_delay_histograms(flights, global_stats_2015, abbr_companies)
plot_delay_histograms(flights_2019, global_stats_2019, abbr_companies)
Occasionally, large delays can be registered. In this section, we examine in more detail the distribution of delays for every airline:
This figure illustrates the normalized distribution of delays that we have modeled with an exponential distribution $f(x) = a \cdot e^{-\frac{x}{b}}$. The parameters 'a' and 'b' obtained to describe each airline are given in the upper right corner of each panel. Note that the normalization of the distribution implies that $\int f(x) \, dx \approx 1$. Here, we do not have a strict equality since the normalization applies to the histograms but not to the model function. However, this relation entails that the 'a' and 'b' coefficients will be correlated with $a \propto \frac{1}{b}$, and hence, only one of these two values is necessary to describe the distributions. Finally, according to the value of either 'a' or 'b', it is possible to establish a ranking of the companies: low values of 'a' correspond to airlines with a large proportion of significant delays, and conversely, airlines that excel in punctuality will have high 'a' values.
Delays: take-off or landing?¶
These figures show that arrival delays are usually shorter than departure delays, suggesting that airlines speed up in flight to minimize arrival delays.
# Mean departure vs. mean arrival delay per airline (2015 data set).
# Reverse mapping of abbr_companies
full_to_abbr = {v: k for k, v in abbr_companies.items()}
# Update matplotlib parameters
# Reset every rcParam to matplotlib's defaults, then thicken hatch lines.
mpl.rcParams.update(mpl.rcParamsDefault)
mpl.rcParams['hatch.linewidth'] = 2.0
# Create figure
fig = plt.figure(1, figsize=(11, 6))
# Plot bar charts
# Both bar plots are drawn on the same axes: departure first, arrival on top.
ax = sns.barplot(x="DEPARTURE_DELAY", y="AIRLINE", data=flights, color="lightskyblue", ci=None)
ax = sns.barplot(x="ARRIVAL_DELAY", y="AIRLINE", data=flights, color="r", hatch='///',
alpha=0.0, ci=None)
# Update y-tick labels with abbreviations
# NOTE(review): AIRLINE already appears to hold carrier codes here, in which
# case the lookup falls back to the unchanged label -- confirm.
labels = [full_to_abbr.get(item.get_text(), item.get_text()) for item in ax.get_yticklabels()]
ax.set_yticklabels(labels)
# Customize plot
ax.yaxis.label.set_visible(False)
plt.xlabel('Mean delay [min] (@departure: blue, @arrival: hatch lines)',
fontsize=14, weight='bold', labelpad=10)
# Show plot
plt.show()
# Mean departure vs. mean arrival delay per airline (second data set).
# Reverse mapping of abbr_companies
full_to_abbr = {v: k for k, v in abbr_companies.items()}
# Update matplotlib parameters
# Reset every rcParam to matplotlib's defaults, then thicken hatch lines.
mpl.rcParams.update(mpl.rcParamsDefault)
mpl.rcParams['hatch.linewidth'] = 2.0
# Create figure
fig = plt.figure(1, figsize=(11, 6))
# Plot bar charts
# Both bar plots are drawn on the same axes: departure first, arrival on top.
ax = sns.barplot(x="DEPARTURE_DELAY", y="AIRLINE", data=flights_2019, color="lightskyblue", ci=None)
ax = sns.barplot(x="ARRIVAL_DELAY", y="AIRLINE", data=flights_2019, color="r", hatch='///',
alpha=0.0, ci=None)
# Update y-tick labels with abbreviations
# NOTE(review): AIRLINE already appears to hold carrier codes here, in which
# case the lookup falls back to the unchanged label -- confirm.
labels = [full_to_abbr.get(item.get_text(), item.get_text()) for item in ax.get_yticklabels()]
ax.set_yticklabels(labels)
# Customize plot
ax.yaxis.label.set_visible(False)
plt.xlabel('Mean delay [min] (@departure: blue, @arrival: hatch lines)',
fontsize=14, weight='bold', labelpad=10)
# Show plot
plt.show()
In this figure, it is evident that arrival delays are typically less severe than departure delays. This suggests that airlines often adjust their flight speeds to make up time and decrease the delays by the time they arrive at their destination.
Relation between the origin airport and delays¶
We start by looking at the total number of origin airports, and then at the number of origin airports served by each airline:
# Number of distinct origin airports (a missing value would count as one).
print(f"Nb of airports: {flights['ORIGIN_AIRPORT'].nunique(dropna=False)}")
Nb of airports: 628
# Number of distinct origin airports (a missing value would count as one).
print(f"Nb of airports: {flights_2019['ORIGIN_AIRPORT'].nunique(dropna=False)}")
Nb of airports: 380
# Number of distinct origin airports per airline, in the order of
# abbr_companies.
origin_nb = {
    carrier: flights.loc[flights['AIRLINE'] == carrier, 'ORIGIN_AIRPORT'].nunique(dropna=False)
    for carrier in abbr_companies
}
test_df = pd.DataFrame.from_dict(origin_nb, orient='index', columns=['count'])

ax = test_df.plot(kind='bar', figsize=(8, 3))
# Show full airline names on the x axis.
full_names = [abbr_companies[tick.get_text()] for tick in ax.get_xticklabels()]
ax.set_xticklabels(full_names)
plt.ylabel('Number of airports visited', fontsize=14, weight='bold', labelpad=12)
plt.setp(ax.get_xticklabels(), fontsize=11, ha='right', rotation=80)
ax.legend().set_visible(False)
plt.show()
# Number of distinct origin airports per airline for the second data set.
# The carriers are taken from the data itself: looping over abbr_companies
# (built from airlines.csv) would skip every carrier that is not listed
# there and draw empty bars for listed carriers without any flight.
origin_nb = flights_2019.groupby('AIRLINE')['ORIGIN_AIRPORT'].nunique().to_dict()
test_df = pd.DataFrame.from_dict(origin_nb, orient='index', columns=['count'])

ax = test_df.plot(kind='bar', figsize=(8, 3))
# Show full airline names where known; unknown codes stay as they are.
labels = [abbr_companies.get(item.get_text(), item.get_text())
          for item in ax.get_xticklabels()]
ax.set_xticklabels(labels)
plt.ylabel('Number of airports visited', fontsize=14, weight='bold', labelpad=12)
plt.setp(ax.get_xticklabels(), fontsize=11, ha='right', rotation=80)
ax.legend().set_visible(False)
plt.show()
In this section, we will explore how delays vary depending on the origin airport and across different airlines. The initial step involves calculating the average delays for each airport:
Given the substantial number of airports involved, presenting all the data simultaneously would result in a cluttered graph, as it would encompass approximately 4400 values (i.e., 312 airports multiplied by 14 airlines). Therefore, we will only display a subset of the data:
# Heatmaps of the mean departure delay per origin airport and airline
# (2015 data set, first 100 airports in order of appearance).
# Lookup tables keyed by airport code: city name, latitude, longitude.
temp = pd.read_csv('data/airports.csv')
identify_airport = temp.set_index('IATA_CODE')['CITY'].to_dict()
latitude_airport = temp.set_index('IATA_CODE')['LATITUDE'].to_dict()
longitude_airport = temp.set_index('IATA_CODE')['LONGITUDE'].to_dict()
# Empty frame indexed by every origin airport; one column per airline is
# added in the loop below.
airport_mean_delays = pd.DataFrame(pd.Series(flights['ORIGIN_AIRPORT'].unique()))
airport_mean_delays.set_index(0, drop = True, inplace = True)
for carrier in abbr_companies.keys():
df1 = flights[flights['AIRLINE'] == carrier]
# Delay statistics of this carrier grouped by origin airport; only the mean
# is kept. Airports the carrier has no rows for end up as NaN.
test = df1['DEPARTURE_DELAY'].groupby(flights['ORIGIN_AIRPORT']).apply(get_stats).unstack()
airport_mean_delays[carrier] = test.loc[:, 'mean']
sns.set(context="paper")
fig = plt.figure(1, figsize=(8,8))
# Left panel: airports 0-49.
ax = fig.add_subplot(1,2,1)
subset = airport_mean_delays.iloc[:50,:].rename(columns = abbr_companies)
subset = subset.rename(index = identify_airport)
# Cells without data are masked instead of colored.
mask = subset.isnull()
sns.heatmap(subset, linewidths=0.01, cmap="Accent", mask=mask, vmin = 0, vmax = 35)
plt.setp(ax.get_xticklabels(), fontsize=10, rotation = 85) ;
ax.yaxis.label.set_visible(False)
# Right panel: airports 50-99.
ax = fig.add_subplot(1,2,2)
subset = airport_mean_delays.iloc[50:100,:].rename(columns = abbr_companies)
subset = subset.rename(index = identify_airport)
# Figure-level title, placed just above the top edge of the figure.
fig.text(0.5, 1.02, "Delays: impact of the origin airport", ha='center', fontsize = 18)
mask = subset.isnull()
sns.heatmap(subset, linewidths=0.01, cmap="Accent", mask=mask, vmin = 0, vmax = 35)
plt.setp(ax.get_xticklabels(), fontsize=10, rotation = 85) ;
ax.yaxis.label.set_visible(False)
plt.tight_layout()
# plt.plot() is called without data here.
plt.plot()
plt.show()
# Heatmaps of the mean departure delay per origin airport and airline
# (second data set, first 100 airports in order of appearance).
#
# The previous loop selected rows of `flights_2019` with a boolean mask built
# from `flights` (the 2015 frame), i.e. a mask belonging to another table,
# and iterated over the 2015 carrier list. Both are avoided by grouping
# `flights_2019` on its own columns; the columns are the carriers that
# actually occur in this data set.
airport_mean_delays = (
    flights_2019.groupby(['ORIGIN_AIRPORT', 'AIRLINE'])['DEPARTURE_DELAY']
    .mean()
    .unstack('AIRLINE')
    # Keep the airports in order of first appearance, as before.
    .reindex(flights_2019['ORIGIN_AIRPORT'].unique())
)
# Drop the axis names so seaborn does not print them as axis labels.
airport_mean_delays.index.name = None
airport_mean_delays.columns.name = None

sns.set(context="paper")
fig = plt.figure(1, figsize=(8, 8))
# Figure-level title, placed just above the top edge of the figure.
fig.text(0.5, 1.02, "Delays: impact of the origin airport", ha='center', fontsize=18)
# Left panel: airports 0-49, right panel: airports 50-99.
for panel, rows in enumerate((slice(0, 50), slice(50, 100)), start=1):
    ax = fig.add_subplot(1, 2, panel)
    subset = (
        airport_mean_delays.iloc[rows, :]
        .rename(columns=abbr_companies)   # full airline names where known
        .rename(index=identify_airport)   # city names where known
    )
    # Cells without data are masked instead of colored.
    sns.heatmap(subset, linewidths=0.01, cmap="Accent", mask=subset.isnull(),
                vmin=0, vmax=35, ax=ax)
    plt.setp(ax.get_xticklabels(), fontsize=10, rotation=85)
    ax.yaxis.label.set_visible(False)
plt.tight_layout()
plt.show()